{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\hasee\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\h5py\\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.layers import Dropout\n",
    "from keras.layers import LSTM\n",
    "from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau\n",
    "from keras.utils import np_utils\n",
    "from keras import optimizers\n",
    "import keras\n",
    "from keras import layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1214016"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the five-star restaurant reviews and show how many rows were read.\n",
    "df = pd.read_csv('../dataset/five_star_restaurants_reviews_only.csv')\n",
    "df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every literal '+' character with a regex replace across the whole frame.\n",
    "df = df.replace(to_replace=r'\\+', value='', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "418955"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only reviews shorter than 251 characters: there is already plenty of data,\n",
    "# and shorter reviews are easier to train on.\n",
    "mask = df['text'].str.len().lt(251)\n",
    "df2 = df[mask]\n",
    "df2.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the order of the reviews so we don't train on 100 Subway ones in a row.\n",
    "# A fixed random_state makes the shuffled order the same on every run, so the corpus\n",
    "# derived from it is reproducible after a kernel restart.\n",
    "short_reviews = df2.sample(frac=1, random_state=42).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only run this the first time: it writes the shuffled reviews to a text file on disk,\n",
    "# space-separated, with neither a header row nor the index column.\n",
    "filename = '../dataset/short_reviews_shuffle.txt'\n",
    "short_reviews.to_csv(filename, sep=' ', header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus length: 72662807\n"
     ]
    }
   ],
   "source": [
    "# Read the whole shuffled corpus into memory; the with-block closes the file handle\n",
    "# as soon as the read is done instead of leaving it open for the life of the kernel.\n",
    "with open('../dataset/short_reviews_shuffle.txt') as corpus_file:\n",
    "    text = corpus_file.read()\n",
    "print('Corpus length:', len(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unique characters: 95\n"
     ]
    }
   ],
   "source": [
    "# Sorted list of the unique characters in the corpus\n",
    "chars = sorted(set(text))\n",
    "print('Unique characters:', len(chars))\n",
    "\n",
    "# Dictionary mapping each unique character to its index in `chars`.\n",
    "# enumerate() hands us the index directly, avoiding a chars.index() scan per character.\n",
    "char_indices = {char: index for index, char in enumerate(chars)}\n",
    "\n",
    "# Number of characters in each input window fed to the model\n",
    "maxlen = 60\n",
    "# Number of characters between the starts of consecutive windows\n",
    "step = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'\\n': 0,\n",
       " ' ': 1,\n",
       " '!': 2,\n",
       " '\"': 3,\n",
       " '#': 4,\n",
       " '$': 5,\n",
       " '%': 6,\n",
       " '&': 7,\n",
       " \"'\": 8,\n",
       " '(': 9,\n",
       " ')': 10,\n",
       " '*': 11,\n",
       " ',': 12,\n",
       " '-': 13,\n",
       " '.': 14,\n",
       " '/': 15,\n",
       " '0': 16,\n",
       " '1': 17,\n",
       " '2': 18,\n",
       " '3': 19,\n",
       " '4': 20,\n",
       " '5': 21,\n",
       " '6': 22,\n",
       " '7': 23,\n",
       " '8': 24,\n",
       " '9': 25,\n",
       " ':': 26,\n",
       " ';': 27,\n",
       " '<': 28,\n",
       " '=': 29,\n",
       " '>': 30,\n",
       " '?': 31,\n",
       " '@': 32,\n",
       " 'A': 33,\n",
       " 'B': 34,\n",
       " 'C': 35,\n",
       " 'D': 36,\n",
       " 'E': 37,\n",
       " 'F': 38,\n",
       " 'G': 39,\n",
       " 'H': 40,\n",
       " 'I': 41,\n",
       " 'J': 42,\n",
       " 'K': 43,\n",
       " 'L': 44,\n",
       " 'M': 45,\n",
       " 'N': 46,\n",
       " 'O': 47,\n",
       " 'P': 48,\n",
       " 'Q': 49,\n",
       " 'R': 50,\n",
       " 'S': 51,\n",
       " 'T': 52,\n",
       " 'U': 53,\n",
       " 'V': 54,\n",
       " 'W': 55,\n",
       " 'X': 56,\n",
       " 'Y': 57,\n",
       " 'Z': 58,\n",
       " '[': 59,\n",
       " '\\\\': 60,\n",
       " ']': 61,\n",
       " '^': 62,\n",
       " '_': 63,\n",
       " '`': 64,\n",
       " 'a': 65,\n",
       " 'b': 66,\n",
       " 'c': 67,\n",
       " 'd': 68,\n",
       " 'e': 69,\n",
       " 'f': 70,\n",
       " 'g': 71,\n",
       " 'h': 72,\n",
       " 'i': 73,\n",
       " 'j': 74,\n",
       " 'k': 75,\n",
       " 'l': 76,\n",
       " 'm': 77,\n",
       " 'n': 78,\n",
       " 'o': 79,\n",
       " 'p': 80,\n",
       " 'q': 81,\n",
       " 'r': 82,\n",
       " 's': 83,\n",
       " 't': 84,\n",
       " 'u': 85,\n",
       " 'v': 86,\n",
       " 'w': 87,\n",
       " 'x': 88,\n",
       " 'y': 89,\n",
       " 'z': 90,\n",
       " '{': 91,\n",
       " '|': 92,\n",
       " '}': 93,\n",
       " '~': 94}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the character-to-index mapping to inspect the vocabulary\n",
    "char_indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# getDataFromChunk lets us vectorize a large data set like ours one chunk at a time.\n",
    "# If you're using a sample of less than 1 million characters you can train on the whole thing at once.\n",
    "\n",
    "def getDataFromChunk(txtChunk, maxlen=60, step=1):\n",
    "    '''\n",
    "    One-hot encode a chunk of text into next-character training pairs.\n",
    "\n",
    "    A window of `maxlen` characters is slid over `txtChunk` in increments of\n",
    "    `step`; each window is one input sample and the character right after it\n",
    "    is that sample's target. Relies on the notebook-level `chars` and\n",
    "    `char_indices`.\n",
    "\n",
    "    Parameters:\n",
    "        txtChunk: string to encode; every character must be a key of `char_indices`.\n",
    "        maxlen: number of characters per input window.\n",
    "        step: number of characters between the starts of consecutive windows.\n",
    "\n",
    "    Returns:\n",
    "        [X, y] where X is a boolean array of shape (n_windows, maxlen, len(chars))\n",
    "        and y is a boolean array of shape (n_windows, len(chars)). n_windows is 0\n",
    "        when the chunk has at most `maxlen` characters.\n",
    "    '''\n",
    "    sentences = []\n",
    "    next_chars = []\n",
    "    for i in range(0, len(txtChunk) - maxlen, step):\n",
    "        sentences.append(txtChunk[i : i + maxlen])\n",
    "        next_chars.append(txtChunk[i + maxlen])\n",
    "    print('nb sequences:', len(sentences))\n",
    "    print('Vectorization...')\n",
    "    # Builtin bool instead of np.bool: the alias is deprecated and removed in newer NumPy\n",
    "    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)\n",
    "    y = np.zeros((len(sentences), len(chars)), dtype=bool)\n",
    "    for i, sentence in enumerate(sentences):\n",
    "        for t, char in enumerate(sentence):\n",
    "            X[i, t, char_indices[char]] = 1\n",
    "        # There is one target per window, so set it once per window, not once per character\n",
    "        y[i, char_indices[next_chars[i]]] = 1\n",
    "    return [X, y]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nb sequences: 110\n",
      "Vectorization...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "170"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Smoke-test getDataFromChunk on a short sample string; the cell displays the sample's length\n",
    "chunk = 'Always love this place and the prices are very reasonable! I am never disappointed. This get Data From Chunk is necessary to process large data sets like the one we have.'\n",
    "x,y = getDataFromChunk(chunk)\n",
    "len(chunk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(110, 60, 95)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the one-hot input array returned by getDataFromChunk\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(110, 95)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the one-hot target array returned by getDataFromChunk\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two stacked 1024-unit LSTMs followed by a softmax over the character vocabulary.\n",
    "# The first LSTM returns its full output sequence so the second one receives a sequence as input.\n",
    "model = keras.models.Sequential([\n",
    "    layers.LSTM(1024, return_sequences=True, input_shape=(maxlen, len(chars))),\n",
    "    layers.LSTM(1024, input_shape=(maxlen, len(chars))),\n",
    "    layers.Dense(len(chars), activation='softmax'),\n",
    "])\n",
    "# Uncomment the next line to resume from previously saved weights\n",
    "#model.load_weights(\"Feb-22-all-00-0.8265.hdf5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adam optimizer at a learning rate of 1e-3, minimizing categorical cross-entropy\n",
    "optimizer = optimizers.Adam(lr=1e-3)\n",
    "model.compile(optimizer=optimizer, loss='categorical_crossentropy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the weights every time the training loss reaches a new minimum, so training can be left running.\n",
    "filepath = \"Feb-22-all-{epoch:02d}-{loss:.4f}.hdf5\"\n",
    "checkpoint = ModelCheckpoint(\n",
    "    filepath,\n",
    "    monitor='loss',\n",
    "    mode='min',\n",
    "    save_best_only=True,\n",
    "    verbose=1\n",
    ")\n",
    "# Learning-rate decay: halve the rate when the loss stops improving, never going below 1e-5.\n",
    "# NOTE(review): the training loop calls fit() once per chunk with epochs=1; confirm that this\n",
    "# callback keeps its patience counter between fit() calls, otherwise it may never trigger.\n",
    "reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=1, min_lr=1e-5)\n",
    "callbacks_list = [checkpoint, reduce_lr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample(preds, temperature=1.0):\n",
    "    '''\n",
    "    Generate some randomness with the given preds\n",
    "    which is a list of numbers, if the temperature\n",
    "    is very small, it will always pick the index\n",
    "    with highest pred value\n",
    "    '''\n",
    "    preds = np.asarray(preds).astype('float64')\n",
    "    preds = np.log(preds) / temperature\n",
    "    exp_preds = np.exp(preds)\n",
    "    preds = exp_preds / np.sum(exp_preds)\n",
    "    probas = np.random.multinomial(1, preds, 1)\n",
    "    return np.argmax(probas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
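    "## Train model\n",
    "Estimated training time (ETA) for 20 passes over the corpus:\n",
    "```\n",
    "72662807 (total chars)\n",
    "/ 90000 (chars per chunk) * 219 (seconds per chunk, one epoch) * 20 (passes over the corpus)\n",
    "/ 60 (s/min) / 60 (min/h) / 24 (h/day)\n",
    "= 40.93 days\n",
    "\n",
    "72662807\n",
    "/ 90000 * 219 * 20\n",
    "/ 60 / 60 / 24\n",
    "= 40.93 days\n",
    "```\n",
    "Note: the training loop below iterates over `range(1, 20)`, which is 19 passes, i.e. about 38.9 days at the same rate."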
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--------------------------------------------------\n",
      "Iteration 1\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 219s 2ms/step - loss: 2.5472\n",
      "\n",
      "Epoch 00001: loss improved from inf to 2.54718, saving model to Feb-22-all-01-2.5472.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 218s 2ms/step - loss: 1.7866\n",
      "\n",
      "Epoch 00001: loss improved from 2.54718 to 1.78660, saving model to Feb-22-all-01-1.7866.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.5162\n",
      "\n",
      "Epoch 00001: loss improved from 1.78660 to 1.51617, saving model to Feb-22-all-01-1.5162.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.3857\n",
      "\n",
      "Epoch 00001: loss improved from 1.51617 to 1.38568, saving model to Feb-22-all-01-1.3857.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 216s 2ms/step - loss: 1.3079\n",
      "\n",
      "Epoch 00001: loss improved from 1.38568 to 1.30785, saving model to Feb-22-all-01-1.3079.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 216s 2ms/step - loss: 1.2753\n",
      "\n",
      "Epoch 00001: loss improved from 1.30785 to 1.27533, saving model to Feb-22-all-01-1.2753.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 216s 2ms/step - loss: 1.2341\n",
      "\n",
      "Epoch 00001: loss improved from 1.27533 to 1.23409, saving model to Feb-22-all-01-1.2341.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.1849\n",
      "\n",
      "Epoch 00001: loss improved from 1.23409 to 1.18494, saving model to Feb-22-all-01-1.1849.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.1828\n",
      "\n",
      "Epoch 00001: loss improved from 1.18494 to 1.18279, saving model to Feb-22-all-01-1.1828.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 218s 2ms/step - loss: 1.1584\n",
      "\n",
      "Epoch 00001: loss improved from 1.18279 to 1.15837, saving model to Feb-22-all-01-1.1584.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.1284\n",
      "\n",
      "Epoch 00001: loss improved from 1.15837 to 1.12840, saving model to Feb-22-all-01-1.1284.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 218s 2ms/step - loss: 1.1178\n",
      "\n",
      "Epoch 00001: loss improved from 1.12840 to 1.11777, saving model to Feb-22-all-01-1.1178.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.0965\n",
      "\n",
      "Epoch 00001: loss improved from 1.11777 to 1.09651, saving model to Feb-22-all-01-1.0965.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.0858\n",
      "\n",
      "Epoch 00001: loss improved from 1.09651 to 1.08584, saving model to Feb-22-all-01-1.0858.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 218s 2ms/step - loss: 1.0927\n",
      "\n",
      "Epoch 00001: loss did not improve\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 220s 2ms/step - loss: 1.0908\n",
      "\n",
      "Epoch 00001: loss did not improve\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 221s 2ms/step - loss: 1.0570\n",
      "\n",
      "Epoch 00001: loss improved from 1.08584 to 1.05702, saving model to Feb-22-all-01-1.0570.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.0580\n",
      "\n",
      "Epoch 00001: loss did not improve\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 217s 2ms/step - loss: 1.0509\n",
      "\n",
      "Epoch 00001: loss improved from 1.05702 to 1.05089, saving model to Feb-22-all-01-1.0509.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "89940/89940 [==============================] - 219s 2ms/step - loss: 1.0329\n",
      "\n",
      "Epoch 00001: loss improved from 1.05089 to 1.03290, saving model to Feb-22-all-01-1.0329.hdf5\n",
      "nb sequences: 89940\n",
      "Vectorization...\n",
      "Epoch 1/1\n",
      "86144/89940 [===========================>..] - ETA: 12s - loss: 1.0595"
     ]
    }
   ],
   "source": [
    "# Train the model by streaming the text file in 90000-character chunks.\n",
    "# After each full pass over the file it prints 300 generated characters at different \"temperatures\".\n",
    "# Temperature controls how random the character sampling is: more temperature == more crazy (but often better) text.\n",
    "# Note: range(1, 20) runs 19 passes over the file.\n",
    "for iteration in range(1, 20):\n",
    "    print()\n",
    "    print('-' * 50)\n",
    "    print('Iteration', iteration)\n",
    "    with open(\"../dataset/short_reviews_shuffle.txt\") as f:\n",
    "        # iter() with a sentinel keeps reading until read() returns the empty string (end of file)\n",
    "        for chunk in iter(lambda: f.read(90000), \"\"):\n",
    "            # A trailing chunk of at most maxlen characters yields no training windows;\n",
    "            # skip it rather than calling fit() on empty arrays.\n",
    "            if len(chunk) <= maxlen:\n",
    "                continue\n",
    "            X, y = getDataFromChunk(chunk, maxlen, step)\n",
    "            model.fit(X, y, batch_size=128, epochs=1, callbacks=callbacks_list)\n",
    "\n",
    "    # Select a text seed at random\n",
    "    start_index = random.randint(0, len(text) - maxlen - 1)\n",
    "    generated_text = text[start_index: start_index + maxlen]\n",
    "    print('--- Generating with seed: \"' + generated_text + '\"')\n",
    "\n",
    "    for temperature in [0.5, 0.8, 1.0]:\n",
    "        print('------ temperature:', temperature)\n",
    "        sys.stdout.write(generated_text)\n",
    "\n",
    "        # We generate 300 characters, one at a time\n",
    "        for i in range(300):\n",
    "            # One-hot encode the current maxlen-character window\n",
    "            sampled = np.zeros((1, maxlen, len(chars)))\n",
    "            for t, char in enumerate(generated_text):\n",
    "                sampled[0, t, char_indices[char]] = 1.\n",
    "\n",
    "            preds = model.predict(sampled, verbose=0)[0]\n",
    "            next_index = sample(preds, temperature)\n",
    "            next_char = chars[next_index]\n",
    "\n",
    "            # Slide the window: append the new character, drop the oldest one\n",
    "            generated_text += next_char\n",
    "            generated_text = generated_text[1:]\n",
    "\n",
    "            sys.stdout.write(next_char)\n",
    "            sys.stdout.flush()\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Run this cell to sample text from the model, whether or not training has finished.\n",
    "\n",
    "# Pick a random maxlen-character window of the corpus as the seed\n",
    "start_index = random.randint(0, len(text) - maxlen - 1)\n",
    "generated_text = text[start_index: start_index + maxlen]\n",
    "print('--- Generating with seed: \"' + generated_text + '\"')\n",
    "\n",
    "for temperature in [0.5, 0.8, 1.0]:\n",
    "    print('------ temperature:', temperature)\n",
    "    print(generated_text, end='')\n",
    "\n",
    "    # Generate 300 characters, one at a time\n",
    "    for i in range(300):\n",
    "        # One-hot encode the current window in a single indexed assignment\n",
    "        sampled = np.zeros((1, maxlen, len(chars)))\n",
    "        positions = list(range(len(generated_text)))\n",
    "        codes = [char_indices[char] for char in generated_text]\n",
    "        sampled[0, positions, codes] = 1.\n",
    "\n",
    "        preds = model.predict(sampled, verbose=0)[0]\n",
    "        next_char = chars[sample(preds, temperature)]\n",
    "\n",
    "        # Slide the window: drop the oldest character and append the new one\n",
    "        generated_text = generated_text[1:] + next_char\n",
    "\n",
    "        print(next_char, end='', flush=True)\n",
    "    print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "12px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
